In [1]:
#################################################################################
#####   INF889E - Méthodes d'intelligence artificielle en bioinformatique   #####
#####             Classification de VIH par zones géographiques             #####
#################################################################################
#####   Author: Riccardo Z******                                            #####
#####   This program is partly inspired by the work presented in a class    #####
#####   workshop by Dylan Lebatteux.                                        #####
#################################################################################
In [2]:
# Import functions
import re
import joblib
import numpy as np
import pandas as pd
from os import listdir
from random import shuffle
from progressbar import ProgressBar
from Bio import SeqIO, pairwise2
from Bio.motifs import create
from sklearn import svm
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from dna_features_viewer import GraphicFeature, GraphicRecord
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
##############################################
#####        IMPORTANT VARIABLES         #####
##############################################
In [4]:
# Scope of classification: if "ALL", classify by region globaly
# If AFR, ASI, CAM, CRB, EUR, FSU, MEA, NAM, OCE or SAM, classify by country within this chosen region 
scope = "ALL"
# Access path for the FASTA files (one file for each region)
path = "../../../../data/" + "complete"
# Name of trained model when saving
model_name = "k-mers-6.pkl"
# For sampling purposes: will process max n sequences for each target class
n_samples = 250000
# Classification features as sum (false) of motifs or as frequency (true) of motifs
freq = False
# Elimination step for features selection
step = 5
# Number of features to select
n_features = 100
# Train / Test split ratio
split_raito = 0.8
# Dimensions for graphs (2D or 3D)
n_components = 2
# Set maximum number of incorrect records to analyse at the end
max_incorrect = 10
# Set maximum number of correct records to compute alignment with at the end
max_correct = 1000
# Set the length k of the features based on k-mers
k = 6
In [5]:
##############################################
#####        DATA INITIALISATION         #####
##############################################
print("\n         DATA INITIALISATION         ")
print("=====================================")
         DATA INITIALISATION         
=====================================
In [6]:
# Will contain all the data rows, in the form of biopython seq_records objects
data = []
# Will contain a pair of target class -> number of data rows with this target class
targets = {}
# Process raw record label information into its annotations, then insert it into data
# To update if the label of sequences in the FASTA files changes
def add_record(record, target):
    # Initialiation of the seq_record
    header = record.id.split(".")
    record.id = header[4]
    record.name = header[3]
    record.seq = record.seq.upper()
    record.annotations = {"target": target, "subtype": header[0], "country": header[1]}
    # Add it to the data table and update the target classes dictionary
    targets[target] = targets.get(target, 0) + 1
    data.append(record)
In [7]:
# Properly fills the data table using the above function
if scope == "ALL":
    # Used to show progress
    progress = ProgressBar()
    # If scope is ALL, each filename is the name of each region used as a target class
    for filename in progress(listdir(path)):
        target = filename.split('.')[0]
        for record in SeqIO.parse(path + "/" + filename, "fasta"):
            add_record(record, target)
    print("")
else:
    # Else, countries are target classes, and the scope region is the filename
    for record in SeqIO.parse(path + "/" + scope + ".fasta", "fasta"):
        target = record.id.split(".")[1]
        add_record(record, target)
100% (9 of 9) |##########################| Elapsed Time: 0:00:02 Time:  0:00:02

In [8]:
# Dipslay data information
print("Data information:")
print("Number of sequences:", sum(targets.values()))
print("Number of targets:", len(targets))
print("Minimum number of instances:", min(targets.values()))
print("Maximum number of instances:", max(targets.values()))

# Dipslay data summary
print("\nData summary:")
for key, value in targets.items(): 
    print("Target:", key, "| Number of sequences:", value)

# Display the first 5 samples
print("\nInformation of the first 5 samples:")
for i in range(5):
    print("ID:", data[i].id, "| Sequence:", data[i].seq[0:50], "| Annotations:", data[i].annotations)
Data information:
Number of sequences: 13916
Number of targets: 9
Minimum number of instances: 40
Maximum number of instances: 5223

Data summary:
Target: AFR | Number of sequences: 3377
Target: ASI | Number of sequences: 2559
Target: CRB | Number of sequences: 40
Target: EUR | Number of sequences: 1783
Target: FUS | Number of sequences: 235
Target: MEA | Number of sequences: 41
Target: NAM | Number of sequences: 5223
Target: OCE | Number of sequences: 40
Target: SAM | Number of sequences: 618

Information of the first 5 samples:
ID: AB049811 | Sequence: TGGATGGGCTAATTTACTCCAAGAAAAGACAAGAGATCCTTGATCTGTGG | Annotations: {'target': 'AFR', 'subtype': '02_AG', 'country': 'GH'}
ID: AB052867 | Sequence: TGGAAGGGTTAATTCATTCCCAGAAAAGACAAGACATCCTTGATCTGTGG | Annotations: {'target': 'AFR', 'subtype': '02A1', 'country': 'GH'}
ID: AB098330 | Sequence: TGGATGGGTTAATTTACTCCAGGAAAAGACAGGAAATCCTTGATCTGTGG | Annotations: {'target': 'AFR', 'subtype': 'A1', 'country': 'UG'}
ID: AB098331 | Sequence: TGGATGGGTTAATTTACTCCAGGAAAAGACAGGAAATCCTTGATCTGTGG | Annotations: {'target': 'AFR', 'subtype': 'A1', 'country': 'UG'}
ID: AB098332 | Sequence: TGGATGGGTTAATTTACTCCAGGAAAAGACAAGAAATCCTTGATCTGTGG | Annotations: {'target': 'AFR', 'subtype': 'A1', 'country': 'UG'}
In [9]:
##############################################
#####      TRAIN / TEST DATA SPLIT       #####
##############################################
# Initialise train/test tables that will contain the data
train_data = []
test_data = []
# Initialise train/test dictionaries that will contain the number of instances for each target
test_split = {}
train_split = {}
# Initialise the dictionary with the targets keys and the value 0
test_split = test_split.fromkeys(targets.keys(), 0)
train_split = train_split.fromkeys(targets.keys(), 0)
# Shuffle the data
shuffle(data)
In [10]:
# Iterate through the data
for d in data:
    # Get this records's target class
    target = d.annotations["target"]
    # For sampling purposes: train/test threshold is based on n_samples if there is too much records for this target
    threshold = min(targets[target], n_samples) * split_raito
    # Until threshold for this target is reached, fills train data
    if train_split[target] < threshold: 
        train_data.append(d)
        train_split[target] += 1
    # Then, fills test data (until eventually n_samples are collected)
    elif test_split[target] < n_samples * (1-split_raito): 
        test_data.append(d)
        test_split[target] += 1
# Shuffle the data
shuffle(train_data)
shuffle(test_data)
In [11]:
# Data summary of the train/test split
print("\nTrain/Test split summary:")
for train_key, test_key in zip(train_split.keys(), test_split.keys()):
    print("Target:", train_key, "| Train instances:", train_split[train_key], "| Test instances:", test_split[test_key])
print("\nTotal number of training instances:", len(train_data))
print("Total number of testing instances:", len(test_data))
Train/Test split summary:
Target: AFR | Train instances: 2702 | Test instances: 675
Target: ASI | Train instances: 2048 | Test instances: 511
Target: CRB | Train instances: 32 | Test instances: 8
Target: EUR | Train instances: 1427 | Test instances: 356
Target: FUS | Train instances: 188 | Test instances: 47
Target: MEA | Train instances: 33 | Test instances: 8
Target: NAM | Train instances: 4179 | Test instances: 1044
Target: OCE | Train instances: 32 | Test instances: 8
Target: SAM | Train instances: 495 | Test instances: 123

Total number of training instances: 11136
Total number of testing instances: 2780
In [12]:
##################################################
#####  FEATURES GENERATION BASED ON K-MERS   #####
##################################################
print("\n         FEATURES GENERATION         ")
print("=====================================")
         FEATURES GENERATION         
=====================================
In [13]:
# Initialize an empty dictionary for the k-mers motifs features
instances = {}
# Used to show progress
progress = ProgressBar()
# Iterate through the training data
for d in train_data:
    # Go through the sequence 
    for i in range(0, len(d.seq) - k + 1, 1):
        # Get the current k-mer motif feature
        feature = str(d.seq[i:i + k])
        # If it contains only the characters "A", "C", "G" or "T", it will be saved
        if re.match('^[ACGT]+$', feature): 
            instances[feature] = 0
    progress.update(len(instances))
    # No need to keep going if motifs dictonary reaches max size
    if len(instances) == 4 ** k:
        break
# Used to show progress
progress.finish()
# Save dictonary keys as biopython motifs object
motifs = create(instances.keys())
# Display the number of features
print("\nNumber of features:", len(motifs.instances), "\n")
| |             #                                  | 4096 Elapsed Time: 0:00:27

Number of features: 4096 

In [14]:
######################################################################
##### GENERATION OF THE FEATURE MATRIX (x) AND TARGET VECTOR (y) #####
######################################################################
In [15]:
# Function to generate feature matrix and target vector
def generateFeatures(data):
    # Initialize the feature matrix
    X = []
    # Initialize the target vector
    y = []
    # Used to show progress
    progress = ProgressBar()
    # Iterate through the data
    for d in progress(data):
        # Generate an empty dictionary
        x = {}
        # Initialize the dictionary with targets as keys and 0 as value
        x = x.fromkeys(motifs.instances, 0)
        # Compute X (features matrix): the number of occurrence of k-mers (with overlaping)
        for i in range(0, len(d.seq) - k + 1, 1):
            feature = d.seq[i:i + k]
            # Attempt to increment the number of occurrences of the current k-mer feature
            try: x[feature] += 1
            # It could fail because the current k-mer is not full ACGT
            except: pass
        # Save the features vector in the features matrix
        X.append(list(x.values()))
        # Save the target class in the target vector
        y.append(d.annotations["target"])
    # Return matrices X and y (feature matrix and target vector)
    return X, y
In [16]:
# Generate train/test feature matrices and target vectors
x_train, y_train = generateFeatures(train_data)
x_test, y_test = generateFeatures(test_data)
100% (11136 of 11136) |##################| Elapsed Time: 0:07:07 Time:  0:07:07
100% (2780 of 2780) |####################| Elapsed Time: 0:01:47 Time:  0:01:47
In [17]:
# Function to generate feature matrix and target vector based on k-mer frequency, not the sum
def generateFreqFeatures(x_sum):
    X = []
    for x in x_sum:
        total = sum(x)
        X.append(list(map((lambda i: i / total), x)))
    return X
In [18]:
# If Freq is ture, then the features matrix are frequency of k-mers, not their sum
if freq:
    x_train = generateFreqFeatures(x_train)
    x_test = generateFreqFeatures(x_test)
In [19]:
##############################################
#####       FEATURES NORMALISATION       #####
##############################################
In [20]:
# Instantiate a MinMaxScaler between 0 and 1
minMaxScaler = MinMaxScaler(feature_range = (0,1))
# Apply a scaling to the train and test set
x_train = minMaxScaler.fit_transform(x_train)
x_test = minMaxScaler.fit_transform(x_test)
In [21]:
##############################################
#####         FEATURES SELECTION         #####
##############################################
print("\n         FEATURES SELECTION          ")
print("=====================================")
         FEATURES SELECTION          
=====================================
In [22]:
# Instantiate a linear model based on svm
model = svm.SVC(C = 1.0, kernel='linear', class_weight = None)
# Instantiate the RFE
rfe = RFE(model, n_features_to_select = n_features, step = step, verbose=True)
# Apply RFE and transform the training matrix
x_train = rfe.fit_transform(x_train, y_train)
# Tranform the test matrix (will be useed later for evaluation purposes)
x_test = rfe.transform(x_test)
.
Fitting estimator with 2751 features.
Fitting estimator with 2746 features.
Fitting estimator with 2741 features.
Fitting estimator with 2736 features.
Fitting estimator with 2731 features.
Fitting estimator with 2726 features.
Fitting estimator with 2721 features.
Fitting estimator with 2716 features.
Fitting estimator with 2711 features.
Fitting estimator with 2706 features.
Fitting estimator with 2701 features.
Fitting estimator with 2696 features.
Fitting estimator with 2691 features.
Fitting estimator with 2686 features.
Fitting estimator with 2681 features.
Fitting estimator with 2676 features.
Fitting estimator with 2671 features.
Fitting estimator with 2666 features.
Fitting estimator with 2661 features.
Fitting estimator with 2656 features.
Fitting estimator with 2651 features.
Fitting estimator with 2646 features.
Fitting estimator with 2641 features.
Fitting estimator with 2636 features.
Fitting estimator with 2631 features.
Fitting estimator with 2626 features.
Fitting estimator with 2621 features.
Fitting estimator with 2616 features.
Fitting estimator with 2611 features.
Fitting estimator with 2606 features.
Fitting estimator with 2601 features.
Fitting estimator with 2596 features.
Fitting estimator with 2591 features.
Fitting estimator with 2586 features.
Fitting estimator with 2581 features.
Fitting estimator with 2576 features.
Fitting estimator with 2571 features.
Fitting estimator with 2566 features.
Fitting estimator with 2561 features.
Fitting estimator with 2556 features.
Fitting estimator with 2551 features.
Fitting estimator with 2546 features.
Fitting estimator with 2541 features.
Fitting estimator with 2536 features.
Fitting estimator with 2531 features.
Fitting estimator with 2526 features.
Fitting estimator with 2521 features.
Fitting estimator with 2516 features.
Fitting estimator with 2511 features.
Fitting estimator with 2506 features.
Fitting estimator with 2501 features.
Fitting estimator with 2496 features.
Fitting estimator with 2491 features.
Fitting estimator with 2486 features.
Fitting estimator with 2481 features.
Fitting estimator with 2476 features.
Fitting estimator with 2471 features.
Fitting estimator with 2466 features.
Fitting estimator with 2461 features.
Fitting estimator with 2456 features.
Fitting estimator with 2451 features.
Fitting estimator with 2446 features.
Fitting estimator with 2441 features.
Fitting estimator with 2436 features.
Fitting estimator with 2431 features.
Fitting estimator with 2426 features.
Fitting estimator with 2421 features.
Fitting estimator with 2416 features.
Fitting estimator with 2411 features.
Fitting estimator with 2406 features.
Fitting estimator with 2401 features.
Fitting estimator with 2396 features.
Fitting estimator with 2391 features.
Fitting estimator with 2386 features.
Fitting estimator with 2381 features.
Fitting estimator with 2376 features.
Fitting estimator with 2371 features.
Fitting estimator with 2366 features.
Fitting estimator with 2361 features.
Fitting estimator with 2356 features.
Fitting estimator with 2351 features.
Fitting estimator with 2346 features.
Fitting estimator with 2341 features.
Fitting estimator with 2336 features.
Fitting estimator with 2331 features.
Fitting estimator with 2326 features.
Fitting estimator with 2321 features.
Fitting estimator with 2316 features.
Fitting estimator with 2311 features.
Fitting estimator with 2306 features.
Fitting estimator with 2301 features.
Fitting estimator with 2296 features.
Fitting estimator with 2291 features.
Fitting estimator with 2286 features.
Fitting estimator with 2281 features.
Fitting estimator with 2276 features.
Fitting estimator with 2271 features.
Fitting estimator with 2266 features.
Fitting estimator with 2261 features.
Fitting estimator with 2256 features.
Fitting estimator with 2251 features.
Fitting estimator with 2246 features.
Fitting estimator with 2241 features.
Fitting estimator with 2236 features.
Fitting estimator with 2231 features.
Fitting estimator with 2226 features.
Fitting estimator with 2221 features.
Fitting estimator with 2216 features.
Fitting estimator with 2211 features.
Fitting estimator with 2206 features.
Fitting estimator with 2201 features.
Fitting estimator with 2196 features.
Fitting estimator with 2191 features.
Fitting estimator with 2186 features.
Fitting estimator with 2181 features.
Fitting estimator with 2176 features.
Fitting estimator with 2171 features.
Fitting estimator with 2166 features.
Fitting estimator with 2161 features.
Fitting estimator with 2156 features.
Fitting estimator with 2151 features.
Fitting estimator with 2146 features.
Fitting estimator with 2141 features.
Fitting estimator with 2136 features.
Fitting estimator with 2131 features.
Fitting estimator with 2126 features.
Fitting estimator with 2121 features.
Fitting estimator with 2116 features.
Fitting estimator with 2111 features.
Fitting estimator with 2106 features.
Fitting estimator with 2101 features.
Fitting estimator with 2096 features.
Fitting estimator with 2091 features.
Fitting estimator with 2086 features.
Fitting estimator with 2081 features.
Fitting estimator with 2076 features.
Fitting estimator with 2071 features.
Fitting estimator with 2066 features.
Fitting estimator with 2061 features.
Fitting estimator with 2056 features.
Fitting estimator with 2051 features.
Fitting estimator with 2046 features.
Fitting estimator with 2041 features.
Fitting estimator with 2036 features.
Fitting estimator with 2031 features.
Fitting estimator with 2026 features.
Fitting estimator with 2021 features.
Fitting estimator with 2016 features.
Fitting estimator with 2011 features.
Fitting estimator with 2006 features.
Fitting estimator with 2001 features.
Fitting estimator with 1996 features.
Fitting estimator with 1991 features.
Fitting estimator with 1986 features.
Fitting estimator with 1981 features.
Fitting estimator with 1976 features.
Fitting estimator with 1971 features.
Fitting estimator with 1966 features.
Fitting estimator with 1961 features.
Fitting estimator with 1956 features.
Fitting estimator with 1951 features.
Fitting estimator with 1946 features.
Fitting estimator with 1941 features.
Fitting estimator with 1936 features.
Fitting estimator with 1931 features.
Fitting estimator with 1926 features.
Fitting estimator with 1921 features.
Fitting estimator with 1916 features.
Fitting estimator with 1911 features.
Fitting estimator with 1906 features.
Fitting estimator with 1901 features.
Fitting estimator with 1896 features.
Fitting estimator with 1891 features.
Fitting estimator with 1886 features.
Fitting estimator with 1881 features.
Fitting estimator with 1876 features.
Fitting estimator with 1871 features.
Fitting estimator with 1866 features.
Fitting estimator with 1861 features.
Fitting estimator with 1856 features.
Fitting estimator with 1851 features.
Fitting estimator with 1846 features.
Fitting estimator with 1841 features.
Fitting estimator with 1836 features.
Fitting estimator with 1831 features.
Fitting estimator with 1826 features.
Fitting estimator with 1821 features.
Fitting estimator with 1816 features.
Fitting estimator with 1811 features.
Fitting estimator with 1806 features.
Fitting estimator with 1801 features.
Fitting estimator with 1796 features.
Fitting estimator with 1791 features.
Fitting estimator with 1786 features.
Fitting estimator with 1781 features.
Fitting estimator with 1776 features.
Fitting estimator with 1771 features.
Fitting estimator with 1766 features.
Fitting estimator with 1761 features.
Fitting estimator with 1756 features.
Fitting estimator with 1751 features.
Fitting estimator with 1746 features.
Fitting estimator with 1741 features.
Fitting estimator with 1736 features.
Fitting estimator with 1731 features.
Fitting estimator with 1726 features.
Fitting estimator with 1721 features.
Fitting estimator with 1716 features.
Fitting estimator with 1711 features.
Fitting estimator with 1706 features.
Fitting estimator with 1701 features.
Fitting estimator with 1696 features.
Fitting estimator with 1691 features.
Fitting estimator with 1686 features.
Fitting estimator with 1681 features.
Fitting estimator with 1676 features.
Fitting estimator with 1671 features.
Fitting estimator with 1666 features.
Fitting estimator with 1661 features.
Fitting estimator with 1656 features.
Fitting estimator with 1651 features.
Fitting estimator with 1646 features.
Fitting estimator with 1641 features.
Fitting estimator with 1636 features.
Fitting estimator with 1631 features.
Fitting estimator with 1626 features.
Fitting estimator with 1621 features.
Fitting estimator with 1616 features.
Fitting estimator with 1611 features.
Fitting estimator with 1606 features.
Fitting estimator with 1601 features.
Fitting estimator with 1596 features.
Fitting estimator with 1591 features.
Fitting estimator with 1586 features.
Fitting estimator with 1581 features.
Fitting estimator with 1576 features.
Fitting estimator with 1571 features.
Fitting estimator with 1566 features.
Fitting estimator with 1561 features.
Fitting estimator with 1556 features.
Fitting estimator with 1551 features.
Fitting estimator with 1546 features.
Fitting estimator with 1541 features.
Fitting estimator with 1536 features.
Fitting estimator with 1531 features.
Fitting estimator with 1526 features.
Fitting estimator with 1521 features.
Fitting estimator with 1516 features.
Fitting estimator with 1511 features.
Fitting estimator with 1506 features.
Fitting estimator with 1501 features.
Fitting estimator with 1496 features.
Fitting estimator with 1491 features.
Fitting estimator with 1486 features.
Fitting estimator with 1481 features.
Fitting estimator with 1476 features.
Fitting estimator with 1471 features.
Fitting estimator with 1466 features.
Fitting estimator with 1461 features.
Fitting estimator with 1456 features.
Fitting estimator with 1451 features.
Fitting estimator with 1446 features.
Fitting estimator with 1441 features.
Fitting estimator with 1436 features.
Fitting estimator with 1431 features.
Fitting estimator with 1426 features.
Fitting estimator with 1421 features.
Fitting estimator with 1416 features.
Fitting estimator with 1411 features.
Fitting estimator with 1406 features.
Fitting estimator with 1401 features.
Fitting estimator with 1396 features.
Fitting estimator with 1391 features.
Fitting estimator with 1386 features.
Fitting estimator with 1381 features.
Fitting estimator with 1376 features.
Fitting estimator with 1371 features.
Fitting estimator with 1366 features.
Fitting estimator with 1361 features.
Fitting estimator with 1356 features.
Fitting estimator with 1351 features.
Fitting estimator with 1346 features.
Fitting estimator with 1341 features.
Fitting estimator with 1336 features.
Fitting estimator with 1331 features.
Fitting estimator with 1326 features.
Fitting estimator with 1321 features.
Fitting estimator with 1316 features.
Fitting estimator with 1311 features.
Fitting estimator with 1306 features.
Fitting estimator with 1301 features.
Fitting estimator with 1296 features.
Fitting estimator with 1291 features.
Fitting estimator with 1286 features.
Fitting estimator with 1281 features.
Fitting estimator with 1276 features.
Fitting estimator with 1271 features.
Fitting estimator with 1266 features.
Fitting estimator with 1261 features.
Fitting estimator with 1256 features.
Fitting estimator with 1251 features.
Fitting estimator with 1246 features.
Fitting estimator with 1241 features.
Fitting estimator with 1236 features.
Fitting estimator with 1231 features.
Fitting estimator with 1226 features.
Fitting estimator with 1221 features.
Fitting estimator with 1216 features.
Fitting estimator with 1211 features.
Fitting estimator with 1206 features.
Fitting estimator with 1201 features.
Fitting estimator with 1196 features.
Fitting estimator with 1191 features.
Fitting estimator with 1186 features.
Fitting estimator with 1181 features.
Fitting estimator with 1176 features.
Fitting estimator with 1171 features.
Fitting estimator with 1166 features.
Fitting estimator with 1161 features.
Fitting estimator with 1156 features.
Fitting estimator with 1151 features.
Fitting estimator with 1146 features.
Fitting estimator with 1141 features.
Fitting estimator with 1136 features.
Fitting estimator with 1131 features.
Fitting estimator with 1126 features.
Fitting estimator with 1121 features.
Fitting estimator with 1116 features.
Fitting estimator with 1111 features.
Fitting estimator with 1106 features.
Fitting estimator with 1101 features.
Fitting estimator with 1096 features.
Fitting estimator with 1091 features.
Fitting estimator with 1086 features.
Fitting estimator with 1081 features.
Fitting estimator with 1076 features.
Fitting estimator with 1071 features.
Fitting estimator with 1066 features.
Fitting estimator with 1061 features.
Fitting estimator with 1056 features.
Fitting estimator with 1051 features.
Fitting estimator with 1046 features.
Fitting estimator with 1041 features.
Fitting estimator with 1036 features.
Fitting estimator with 1031 features.
Fitting estimator with 1026 features.
Fitting estimator with 1021 features.
Fitting estimator with 1016 features.
Fitting estimator with 1011 features.
Fitting estimator with 1006 features.
Fitting estimator with 1001 features.
Fitting estimator with 996 features.
Fitting estimator with 991 features.
Fitting estimator with 986 features.
Fitting estimator with 981 features.
Fitting estimator with 976 features.
Fitting estimator with 971 features.
Fitting estimator with 966 features.
Fitting estimator with 961 features.
Fitting estimator with 956 features.
Fitting estimator with 951 features.
Fitting estimator with 946 features.
Fitting estimator with 941 features.
Fitting estimator with 936 features.
Fitting estimator with 931 features.
Fitting estimator with 926 features.
Fitting estimator with 921 features.
Fitting estimator with 916 features.
Fitting estimator with 911 features.
Fitting estimator with 906 features.
Fitting estimator with 901 features.
Fitting estimator with 896 features.
Fitting estimator with 891 features.
Fitting estimator with 886 features.
Fitting estimator with 881 features.
Fitting estimator with 876 features.
Fitting estimator with 871 features.
Fitting estimator with 866 features.
Fitting estimator with 861 features.
Fitting estimator with 856 features.
Fitting estimator with 851 features.
Fitting estimator with 846 features.
Fitting estimator with 841 features.
Fitting estimator with 836 features.
Fitting estimator with 831 features.
Fitting estimator with 826 features.
Fitting estimator with 821 features.
Fitting estimator with 816 features.
Fitting estimator with 811 features.
Fitting estimator with 806 features.
Fitting estimator with 801 features.
Fitting estimator with 796 features.
Fitting estimator with 791 features.
Fitting estimator with 786 features.
Fitting estimator with 781 features.
Fitting estimator with 776 features.
Fitting estimator with 771 features.
Fitting estimator with 766 features.
Fitting estimator with 761 features.
Fitting estimator with 756 features.
Fitting estimator with 751 features.
Fitting estimator with 746 features.
Fitting estimator with 741 features.
Fitting estimator with 736 features.
Fitting estimator with 731 features.
Fitting estimator with 726 features.
Fitting estimator with 721 features.
Fitting estimator with 716 features.
Fitting estimator with 711 features.
Fitting estimator with 706 features.
Fitting estimator with 701 features.
Fitting estimator with 696 features.
Fitting estimator with 691 features.
Fitting estimator with 686 features.
Fitting estimator with 681 features.
Fitting estimator with 676 features.
Fitting estimator with 671 features.
Fitting estimator with 666 features.
Fitting estimator with 661 features.
Fitting estimator with 656 features.
Fitting estimator with 651 features.
Fitting estimator with 646 features.
Fitting estimator with 641 features.
Fitting estimator with 636 features.
Fitting estimator with 631 features.
Fitting estimator with 626 features.
Fitting estimator with 621 features.
Fitting estimator with 616 features.
Fitting estimator with 611 features.
Fitting estimator with 606 features.
Fitting estimator with 601 features.
Fitting estimator with 596 features.
Fitting estimator with 591 features.
Fitting estimator with 586 features.
Fitting estimator with 581 features.
Fitting estimator with 576 features.
Fitting estimator with 571 features.
Fitting estimator with 566 features.
Fitting estimator with 561 features.
Fitting estimator with 556 features.
Fitting estimator with 551 features.
Fitting estimator with 546 features.
Fitting estimator with 541 features.
Fitting estimator with 536 features.
Fitting estimator with 531 features.
Fitting estimator with 526 features.
Fitting estimator with 521 features.
Fitting estimator with 516 features.
Fitting estimator with 511 features.
Fitting estimator with 506 features.
Fitting estimator with 501 features.
Fitting estimator with 496 features.
Fitting estimator with 491 features.
Fitting estimator with 486 features.
Fitting estimator with 481 features.
Fitting estimator with 476 features.
Fitting estimator with 471 features.
Fitting estimator with 466 features.
Fitting estimator with 461 features.
Fitting estimator with 456 features.
Fitting estimator with 451 features.
Fitting estimator with 446 features.
Fitting estimator with 441 features.
Fitting estimator with 436 features.
Fitting estimator with 431 features.
Fitting estimator with 426 features.
Fitting estimator with 421 features.
Fitting estimator with 416 features.
Fitting estimator with 411 features.
Fitting estimator with 406 features.
Fitting estimator with 401 features.
Fitting estimator with 396 features.
Fitting estimator with 391 features.
Fitting estimator with 386 features.
Fitting estimator with 381 features.
Fitting estimator with 376 features.
Fitting estimator with 371 features.
Fitting estimator with 366 features.
Fitting estimator with 361 features.
Fitting estimator with 356 features.
Fitting estimator with 351 features.
Fitting estimator with 346 features.
Fitting estimator with 341 features.
Fitting estimator with 336 features.
Fitting estimator with 331 features.
Fitting estimator with 326 features.
Fitting estimator with 321 features.
Fitting estimator with 316 features.
Fitting estimator with 311 features.
Fitting estimator with 306 features.
Fitting estimator with 301 features.
Fitting estimator with 296 features.
Fitting estimator with 291 features.
Fitting estimator with 286 features.
Fitting estimator with 281 features.
Fitting estimator with 276 features.
Fitting estimator with 271 features.
Fitting estimator with 266 features.
Fitting estimator with 261 features.
Fitting estimator with 256 features.
Fitting estimator with 251 features.
Fitting estimator with 246 features.
Fitting estimator with 241 features.
Fitting estimator with 236 features.
Fitting estimator with 231 features.
Fitting estimator with 226 features.
Fitting estimator with 221 features.
Fitting estimator with 216 features.
Fitting estimator with 211 features.
Fitting estimator with 206 features.
Fitting estimator with 201 features.
Fitting estimator with 196 features.
Fitting estimator with 191 features.
Fitting estimator with 186 features.
Fitting estimator with 181 features.
Fitting estimator with 176 features.
Fitting estimator with 171 features.
Fitting estimator with 166 features.
Fitting estimator with 161 features.
Fitting estimator with 156 features.
Fitting estimator with 151 features.
Fitting estimator with 146 features.
Fitting estimator with 141 features.
Fitting estimator with 136 features.
Fitting estimator with 131 features.
Fitting estimator with 126 features.
Fitting estimator with 121 features.
Fitting estimator with 116 features.
Fitting estimator with 111 features.
Fitting estimator with 106 features.
Fitting estimator with 101 features.
In [23]:
# Compute the reduction percentage of the feature matrix
reduction_percentage = ((len(motifs.instances) - n_features) / len(motifs.instances) * 100)
# Print the reduction percentage
print("\nReduction percentage:", round(reduction_percentage, 2), "%")
Reduction percentage: 97.56 %
In [24]:
# Initialize the table that will contain the selected features
instances = []
# Save selected k-mers features
for i, mask in enumerate(rfe.support_): 
    if mask == True: instances.append(motifs.instances[i])
# Save table as biopython motifs object
features = create(instances)
In [25]:
##############################################
#####    TRAINING DATA VISUALISATION     #####
##############################################
print("\n     TRAINING DATA VISUALISATION     ")
print("=====================================")
     TRAINING DATA VISUALISATION     
=====================================
In [26]:
# Define the function to draw Scatter Plot
def generateScatterPlot(title, figure_width, figure_height, data, X, y):
    # If 2d dimensions
    if n_components == 2:
        # Initialize a 2-dimensional figure
        fig, ax = plt.subplots(figsize=(figure_width, figure_height))
    # If 3d dimensions
    else:
        # Initialize a 3-dimensional figure
        fig = plt.figure(figsize=(15, 10))
        ax = Axes3D(fig)
    # List of markers
    markers = ["o","+", "^", "x"]
    # List of colors
    colors = ["tab:blue", "tab:orange", 
              "tab:green", "tab:red", 
              "tab:purple", "tab:brown", 
              "tab:pink", "tab:grey", 
              "tab:olive", "tab:cyan",]
    
    # Iterate through the targets
    for i, target in enumerate(y):
        # Set the list of axis positions
        x = []
        y = []
        z = []
        # If the number of targets is less than 10
        if i < 10:
            color = colors[i]
            marker = markers[0]
        # If the number of targets is less than 20
        elif i < 20:
            color = colors[i-10]
            marker = markers[1]
        # If the number of targets is less than 30
        elif i < 30:
            color = colors[i-20]
            marker = markers[2]
        # If the number of targets is less than 40
        else:
            color = colors[i-30]
            marker = markers[3]
            
        # Iterate through the data
        for i, d in enumerate(data):
            # If the sequence belongs to the target of interest
            if d.annotations["target"] == target:
                # Save the value of the positions
                x.append(X[i][0])
                y.append(X[i][1])
                if n_components == 3: z.append(X[i][2])
              
        # Add the current scatter plot to the figure
        if n_components == 2:
            ax.scatter(x, y, c = color, label = target, alpha = 0.75, edgecolors = 'none', marker=marker)
        else:
            ax.scatter(x, y, z, c = color, label=target,alpha=0.75, edgecolors='none', marker=marker)

    # Display the grid
    ax.grid(True)
    # Set the legend parameters
    ax.legend(loc = 2, prop = {'size': 10})
    # Set the tite
    plt.title(title)
    # Set axes labels
    if n_components == 2:
        plt.xlabel('PC1')
        plt.ylabel('PC2')
    else: 
        ax.set_xlabel('PC1')
        ax.set_ylabel('PC2')
        ax.set_zlabel('PC3')
    # Displqy the figure
    plt.show()
In [27]:
# Instantiate a TSNE with 3 principal components
tsne = TSNE(n_components = 3, perplexity = 50, verbose=True)
# Apply TSNE to X_train
x_tsne = tsne.fit_transform(x_train)
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 11136 samples in 0.005s...
[t-SNE] Computed neighbors for 11136 samples in 4.670s...
[t-SNE] Computed conditional probabilities for sample 1000 / 11136
[t-SNE] Computed conditional probabilities for sample 2000 / 11136
[t-SNE] Computed conditional probabilities for sample 3000 / 11136
[t-SNE] Computed conditional probabilities for sample 4000 / 11136
[t-SNE] Computed conditional probabilities for sample 5000 / 11136
[t-SNE] Computed conditional probabilities for sample 6000 / 11136
[t-SNE] Computed conditional probabilities for sample 7000 / 11136
[t-SNE] Computed conditional probabilities for sample 8000 / 11136
[t-SNE] Computed conditional probabilities for sample 9000 / 11136
[t-SNE] Computed conditional probabilities for sample 10000 / 11136
[t-SNE] Computed conditional probabilities for sample 11000 / 11136
[t-SNE] Computed conditional probabilities for sample 11136 / 11136
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 77.041931
[t-SNE] KL divergence after 1000 iterations: 1.081706
In [28]:
# Generate scatter plot of a TSNE
generateScatterPlot(title= "Scatter plot of a two-dimensional TSNE applied to the training data", 
                    figure_width = 15, 
                    figure_height = 12, 
                    data = train_data, 
                    X = x_tsne, 
                    y = set(y_train))
2021-05-03T13:56:49.260327 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
In [29]:
# Instantiate PCA with 3 principal components
pca = PCA(n_components = 3)
x_pca =  pca.fit_transform(x_train)
In [30]:
# Generate scatter plot of a PCA
generateScatterPlot(title= "Scatter plot of a two-dimensional PCA applied to the training data", 
                    figure_width = 15, 
                    figure_height = 12, 
                    data = train_data, 
                    X = x_pca, 
                    y = set(y_train))
2021-05-03T13:56:51.301427 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
In [31]:
##############################################
#####   MODEL TRAINING AND PREDICTION    #####
##############################################
print("\n    MODEL TRAINING AND PREDICTION    ")
print("=====================================")
    MODEL TRAINING AND PREDICTION    
=====================================
In [32]:
# Fit the model on the train set
model.fit(x_train, y_train)
# Save the model to filename model_name
joblib.dump(model, model_name)
Out[32]:
['k-mers-6.pkl']
In [33]:
# Predict with model on the test set
y_pred = model.predict(x_test)
# Display prediction
print("Predictions (" + str(len(y_pred)) + "):", y_pred)
Predictions (2780): ['NAM' 'AFR' 'AFR' ... 'NAM' 'EUR' 'AFR']
In [34]:
##############################################
#####  MODEL PREDICTIONS VISUALISATION   #####
##############################################
print("\n   MODEL PREDICTIONS VISUALISATION   ")
print("=====================================")
   MODEL PREDICTIONS VISUALISATION   
=====================================
In [35]:
# Will contain correct and incorrect data seq_records objects
correct_data = []
incorrect_data = []
# Will contain correct and incorrect features vectors (just like x_test)
correct_features = []
incorrect_features = []
# Iterate through test data
for i, d in enumerate(test_data):
    # Add an annotation to all test data stating its percentage range of ACGT characters
    total_char = len(d.seq)
    total_acgt = 0
    for char in d.seq:
        if re.match('^[ACGT]+$', char):
            total_acgt += 1
    acgt_percent = total_acgt / total_char
    if acgt_percent >= 0.75: d.annotations["acgt-percent"] = "75-100"
    elif acgt_percent >= 0.50: d.annotations["acgt-percent"] = "50-75"
    elif acgt_percent >= 0.25: d.annotations["acgt-percent"] = "25-50"
    else: d.annotations["acgt-percent"] = "0-25"
    # Split test data into correct and incorrect sets depending on prediction results
    if y_pred[i] == d.annotations["target"]:
        correct_data.append(d)
        correct_features.append(x_test[i])
    else:
        # If it's incorrect, add the prediction class as an annotation
        d.annotations["prediction"] = y_pred[i]
        incorrect_data.append(d)
        incorrect_features.append(x_test[i])
In [36]:
# Print the classification_report
print(classification_report(y_test, y_pred, digits = 3))
              precision    recall  f1-score   support

         AFR      0.897     0.927     0.912       675
         ASI      0.894     0.955     0.923       511
         CRB      0.667     0.250     0.364         8
         EUR      0.726     0.320     0.444       356
         FUS      0.977     0.894     0.933        47
         MEA      0.833     0.625     0.714         8
         NAM      0.827     0.985     0.899      1044
         OCE      0.250     0.125     0.167         8
         SAM      0.800     0.520     0.631       123

    accuracy                          0.853      2780
   macro avg      0.763     0.622     0.665      2780
weighted avg      0.843     0.853     0.833      2780

In [37]:
# Dictonaries with pair of annotation -> number of incorrect records with this annotation
subtypes = {}
countries = {}
predictions = {}
acgt_percents = {}
# Iterate through incorrect data
for i in incorrect_data:
    # Increment each kind of annotation with current record values as keys
    subtypes[i.annotations["subtype"]] = subtypes.get(i.annotations["subtype"], 0) + 1
    countries[i.annotations["country"]] = countries.get(i.annotations["country"], 0) + 1
    predictions[i.annotations["prediction"]] = predictions.get(i.annotations["prediction"], 0) + 1
    acgt_percents[i.annotations["acgt-percent"]] = acgt_percents.get(i.annotations["acgt-percent"], 0) + 1
# Display number of incorrect records for each annotation, useful to spot any pattern here
print("Incorrect predictions annotations:")
print("Subtype:", subtypes)
print("Country:", countries)
print("Prediction:", predictions)
print("ACGT percent:", acgt_percents)
Incorrect predictions annotations:
Subtype: {'C': 33, 'B': 225, 'A1': 16, '71_BF1': 1, '93_cpx': 1, '-': 24, 'BF1': 8, 'A1D': 1, '47_BF': 1, 'D': 2, 'F1': 4, 'A1C': 1, 'L': 1, '01_AE': 7, '02G': 5, '50_A1D': 1, 'A6': 1, '53_01B': 1, '23_BG': 1, '11_cpx': 3, 'AD': 1, '43_02G': 1, 'F2O': 1, '06_cpx': 1, 'BF': 1, '02_AG': 9, 'CD': 1, '56_cpx': 2, '46_BF': 1, '06A3': 1, 'DO': 1, '12_BF': 2, 'BCF1': 1, '01F1G': 1, '45_cpx': 1, 'O': 7, 'K': 1, 'BC': 1, '16_A2D': 1, 'H': 3, '02A1': 3, 'G': 9, '49_cpx': 1, '66_BF1': 1, '18_cpx': 2, '50B': 1, '02B': 1, '01B': 2, 'U': 1, '89_BF': 1, 'A1GH': 1, 'GKU': 1, 'BG': 1, '22F2': 1, 'P': 1, 'N': 1, '37_cpx': 1, 'BF1G': 1, 'DJ': 1, '54_01B': 1, '27_cpx': 1, '20_BG': 1, '32_06A6': 1}
Country: {'BE': 16, 'BR': 37, 'YE': 1, 'DO': 2, 'DE': 63, 'ES': 36, 'TZ': 2, 'ZA': 6, 'CH': 16, 'IN': 2, 'GB': 48, 'AR': 8, 'FR': 14, 'PE': 12, 'CD': 7, 'KE': 2, 'SE': 18, 'CY': 19, 'PL': 1, 'AU': 7, 'US': 16, 'CM': 9, 'RU': 5, 'MY': 3, 'CU': 4, 'ET': 5, 'NG': 7, 'BW': 4, 'SA': 1, 'GW': 2, 'PK': 5, 'UY': 2, 'JP': 2, 'KR': 2, 'GH': 1, 'ID': 2, 'EE': 2, 'IT': 3, 'GM': 1, 'AO': 1, 'RW': 1, 'CN': 3, 'DK': 4, 'TH': 3, 'GR': 1, 'IL': 1, 'CF': 1, 'MO': 1, 'NL': 1}
Prediction: {'AFR': 72, 'NAM': 215, 'MEA': 1, 'EUR': 43, 'ASI': 58, 'SAM': 16, 'CRB': 1, 'OCE': 3, 'FUS': 1}
ACGT percent: {'75-100': 410}
In [38]:
# Compute the confusion matrix
matrix = confusion_matrix(y_true = y_test, y_pred = y_pred)
# Build the heatmap
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(matrix, 
            cmap = 'Blues', 
            annot = True, 
            fmt = ".0f", 
            linewidth = 0.1, 
            xticklabels = targets.keys(), 
            yticklabels = targets.keys())
plt.title("Confusion matrix")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.show()
2021-05-03T13:57:30.700688 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
In [39]:
# Show percentage of occurence of all features for all target classes in both train and test data
matrix = []
# Iterate through features
for i, feature in enumerate(features.instances):
    # Generate an empty dictionary
    x = {}
    # Initialize the dictionary with targets as keys and 0 as value
    x = x.fromkeys(targets.keys(), 0)
    # Count in all train data
    for f, d in zip(x_train, train_data):
        if f[i] > 0: x[d.annotations["target"]] += 1
    # Count in all test data
    for f, d in zip(x_test, test_data):
        if f[i] > 0: x[d.annotations["target"]] += 1
    # Vector of attendance percentage
    vector = []
    # Iterate through the number of instances and the number of occurrences
    for n_instances, n_occurrences in zip(targets.values(), x.values()):
        n_instances = min(n_instances, n_samples)
        # Compute the percentage of k-mers attendance by target
        attendance_percentage = 100 - ((n_instances - n_occurrences) / n_instances * 100)
        # Save the attendance percentage in the specitic vector
        vector.append(int(attendance_percentage))
    # Save the vector of attendance percentage in the heatmap matrix
    matrix.append(vector)
# Build the heatmap
fig, ax = plt.subplots(figsize=(15, 20))
sns.heatmap(matrix, 
            annot = True, 
            fmt = ".0f", 
            cmap = 'Blues',
            linewidth = 0.1, 
            xticklabels = targets.keys(), 
            yticklabels = features.instances)
plt.title("Percentage of presence of k-mers according to HIV subtypes")
plt.xlabel("Target")
plt.ylabel("Features")
plt.show()
2021-05-03T13:57:36.001847 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
In [40]:
# For all incorrect records, compute average feature vectors of all correct records for both true and predicted classes
for i_data, i_features in zip(incorrect_data[0:max_incorrect], incorrect_features[0:max_incorrect]):
    # Both matrices to plot
    true_features = []
    pred_features = []
    # Iterate through correct records
    for c_data, c_features in zip(correct_data, correct_features):
        # Compare only if both records are somewhat similar (either same subtype or acgt-percentage range)
        if i_data.annotations["subtype"] == c_data.annotations["subtype"] or i_data.annotations["acgt-percent"] == c_data.annotations["acgt-percent"]:
            # If this correct record is in the same class as current incorrect record
            if i_data.annotations["target"] == c_data.annotations["target"]:
                true_features.append(c_features)
            # If this correct record is in the class that the current incorrect record has been predicted to  
            if i_data.annotations["prediction"] == c_data.annotations["target"]:
                pred_features.append(c_features)
    # Compute avergare matrices only if similar correct records are found (avoid div per 0)
    if len(true_features) != 0 and len(pred_features) != 0:
        true_features_mean = np.array(true_features).mean(axis=0)
        pred_features_mean = np.array(pred_features).mean(axis=0)
        # Build the heatmap
        fig, ax = plt.subplots(figsize=(40,5))
        sns.heatmap([true_features_mean, i_features, pred_features_mean], 
                #annot = True, 
                #fmt = ".0f", 
                linewidth = 0.1,
                cmap = 'Blues',
                xticklabels = features.instances,
                yticklabels = ["True", "Incorrect", "Prediction"],)
        plt.title("Comparaison of incorrect features vector with true and predicted features vectors averages")
        plt.xlabel("Features")
        plt.show()
2021-05-03T13:57:40.843874 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:57:43.173673 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:57:45.404709 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:57:47.512082 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:57:49.640361 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:57:51.858433 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:57:54.052568 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:57:56.191886 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:57:58.389984 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:58:00.629992 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
In [41]:
# For all incorrect records, compare apparence percentage of all correct records in both true and predicted classes
for i_data, i_features in zip(incorrect_data[0:max_incorrect], incorrect_features[0:max_incorrect]):
    # Dictionaries containing nb of occurences of features for all correct records
    true_features = {}
    pred_features = {}
    true_features = true_features.fromkeys(features.instances, 0)
    pred_features = pred_features.fromkeys(features.instances, 0)
    true_total = 0
    pred_total = 0
    # Iterate through correct records
    for c_data, c_features in zip(correct_data, correct_features):
        # Compare only if both records are somewhat similar (either same subtype or acgt-percentage range)
        #if i_data.annotations["subtype"] == c_data.annotations["subtype"] or i_data.annotations["acgt-percent"] == c_data.annotations["acgt-percent"]:
        # If this correct record is in the same class as current incorrect record
        if i_data.annotations["target"] == c_data.annotations["target"]:
            true_total += 1
            for value, key in zip(c_features, features.instances):
                if value > 0: true_features[key] += 1
        # If this correct record is in the class that the current incorrect record has been predicted to  
        if i_data.annotations["prediction"] == c_data.annotations["target"]:
            pred_total += 1
            for value, key in zip(c_features, features.instances):
                if value > 0: pred_features[key] += 1
    # Compute avergare matrices only if similar correct records are found (avoid div per 0)
    if true_total != 0 and pred_total != 0:
        true_vector = list(map((lambda i: i / true_total), true_features.values()))
        pred_vector = list(map((lambda i: i / pred_total), pred_features.values()))
        # Build the heatmap
        fig, ax = plt.subplots(figsize=(40,5))
        sns.heatmap([true_vector, i_features, pred_vector], 
                #annot = True, 
                #fmt = ".0f", 
                linewidth = 0.1,
                cmap = 'Blues',
                xticklabels = features.instances,
                yticklabels = ["True", "Incorrect", "Prediction"],)
        plt.title("Comparaison of incorrect features vector with true and predicted vectors of occurences percents")
        plt.xlabel("Features")
        plt.show()
2021-05-03T13:58:03.015615 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:58:05.458088 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:58:07.659231 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:58:09.915173 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:58:12.296838 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:58:14.476554 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:58:16.883148 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:58:19.119172 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:58:21.372122 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
2021-05-03T13:58:23.634079 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
In [42]:
# Compute alignement of all incorrect records to all correct record and compute avegarge of scores
print("\nComparison of alignement scores between true and predicted class:")
ids = []
matrix = []
# Used to show progress
progress = ProgressBar(max_value=len(incorrect_data[0:max_incorrect])*len(correct_data[0:max_correct])).start()
count = 0
# Shuffle correct data (when we're sampling it)
shuffle(correct_data)
# Iterate through incorrect data
for i in incorrect_data[0:max_incorrect]:
    # Keep different averages for same target class and predicted target class of incorrect record
    true_score_sum = 0
    true_score_nb = 0
    pred_score_sum = 0
    pred_score_nb = 0
    # Iterate through correct data
    for c in correct_data[0:max_correct]:
        # Compare only if both records are somewhat in the same category (both same subtype and acgt-percentage range)
        if i.annotations["subtype"] == c.annotations["subtype"] and i.annotations["acgt-percent"] == c.annotations["acgt-percent"]:
            # If this correct record is in the same class as current incorrect record
            if i.annotations["target"] == c.annotations["target"]:
                true_score_sum += pairwise2.align.globalxx(i.seq, c.seq, score_only=True)
                true_score_nb += 1
            # If this correct record is in the class that the current incorrect record has been predicted to
            if i.annotations["prediction"] == c.annotations["target"]:
                pred_score_sum += pairwise2.align.globalxx(i.seq, c.seq, score_only=True)
                pred_score_nb += 1
        # Used to show progress
        count += 1
        progress.update(count)
    # Compute avergare only if similar correct records are found (avoid div per 0)
    if true_score_nb != 0 and pred_score_nb != 0:
        ids.append(i.id)
        matrix.append([true_score_sum/true_score_nb, pred_score_sum/pred_score_nb])
# Normalise results
matrix = pd.DataFrame(np.array(matrix))
matrix = matrix.div(matrix.max(axis=1), axis=0)
# Build the heatmap
fig, ax = plt.subplots()
sns.heatmap(matrix, 
            #annot = True, 
            #fmt = ".0f", 
            linewidth = 0.1,
            cmap = 'Blues',
            xticklabels = ["True", "Prediction"], 
            yticklabels = ids)
plt.title("Comparison of alignement scores between true and predicted class")
plt.xlabel("Target")
plt.ylabel("ID")
plt.show()
  0% (1 of 10000) |                      | Elapsed Time: 0:00:01 ETA:   3:02:59
Comparison of alignement scores between true and predicted class:
 99% (9999 of 10000) |################## | Elapsed Time: 1:13:57 ETA:   0:00:01
2021-05-03T15:12:22.236661 image/svg+xml Matplotlib v3.4.1, https://matplotlib.org/
In [43]:
# Tried something, did not work yet...

#features = create(["GGCGG"])
#for i in incorrect_data:
#    graphic_features = []
#    progress = ProgressBar()
#    for pos, seq in progress(features.instances.search(i.seq)):
#        graphic_features.append(GraphicFeature(start = pos, end= pos + k, strand = +1, color= "#ffd700", label=str(seq + "\n" + "Position : " + str(pos))))
#    record = GraphicRecord(sequence_length = len(i.seq), features=graphic_features)
#    record.plot(figure_width = 15)
#    plt.title("Sequence : " + i.id) 
#    plt.show()
#for c in correct_data:
#    if c.annotations["target"] == "CRB":
#        graphic_features = []
#        progress = ProgressBar()
#        for pos, seq in progress(features.instances.search(i.seq)):
#            graphic_features.append(GraphicFeature(start = pos, end= pos + k, strand = +1, color= "#ffd700", label=str(seq + "\n" + "Position : " + str(pos))))
#        record = GraphicRecord(sequence_length = len(i.seq), features=graphic_features)
#        record.plot(figure_width = 15)
#        plt.title("Sequence : " + c.id) 
#        plt.show()
#        break
#for c in correct_data:
#    if c.annotations["target"] == "OCE":
#        graphic_features = []
#        progress = ProgressBar()
#        for pos, seq in progress(features.instances.search(i.seq)):
#            graphic_features.append(GraphicFeature(start = pos, end= pos + k, strand = +1, color= "#ffd700", label=str(seq + "\n" + "Position : " + str(pos))))
#        record = GraphicRecord(sequence_length = len(i.seq), features=graphic_features)
#        record.plot(figure_width = 15)
#        plt.title("Sequence : " + c.id) 
#        plt.show()
#        break